In [1]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
# Import train_test_split
from sklearn.cross_validation import train_test_split
from collections import defaultdict
from wordcloud import WordCloud


# magic word for producing visualizations in notebook
%matplotlib inline
C:\Users\Akshat\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning:

This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

Business and Data Understanding

What is the data like?
How many numerical and string dtypes do we have?
What is the correlation between installs, reviews, price, and free versus paid apps?
What are the categorical data?
In [2]:
# Read both comma-separated source files from the working directory:
# the app catalogue first, then the per-app user reviews.
df_playstore, df_reviews = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ('googleplaystore.csv', 'googleplaystore_user_reviews.csv')
)
# Preview the first rows of the app catalogue.
df_playstore.head()
Out[2]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone Art & Design January 7, 2018 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone Art & Design;Pretend Play January 15, 2018 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7M 5,000,000+ Free 0 Everyone Art & Design August 1, 2018 1.2.4 4.0.3 and up
3 Sketch - Draw & Paint ART_AND_DESIGN 4.5 215644 25M 50,000,000+ Free 0 Teen Art & Design June 8, 2018 Varies with device 4.2 and up
4 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3 967 2.8M 100,000+ Free 0 Everyone Art & Design;Creativity June 20, 2018 1.1 4.4 and up
In [3]:
# Summarise the raw app catalogue: dimensions, per-column dtype / non-null
# counts, and the column labels.
print (df_playstore.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df_playstore.info()
print (df_playstore.columns)
(10841, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10841 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null object
Size              10841 non-null object
Installs          10841 non-null object
Type              10840 non-null object
Price             10841 non-null object
Content Rating    10840 non-null object
Genres            10841 non-null object
Last Updated      10841 non-null object
Current Ver       10833 non-null object
Android Ver       10838 non-null object
dtypes: float64(1), object(12)
memory usage: 1.1+ MB
None
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

Data Understanding

List the unique values of each column together with their counts, to see which kinds of values occur, what dtype each column has, and which data-preparation tasks lie ahead.
In [4]:
# Print the frequency table of every column in turn so that odd values
# (stray labels, mixed formats) stand out before cleaning.
for column_name in list(df_playstore.columns):
    frequency_table = df_playstore[column_name].value_counts()
    print (frequency_table)
ROBLOX                                                9
CBS Sports App - Scores, News, Stats & Watch Live     8
8 Ball Pool                                           7
Duolingo: Learn Languages Free                        7
Candy Crush Saga                                      7
ESPN                                                  7
Bleacher Report: sports news, scores, & highlights    6
Nick                                                  6
Zombie Catchers                                       6
Temple Run 2                                          6
Sniper 3D Gun Shooter: Free Shooting Games - FPS      6
Subway Surfers                                        6
Helix Jump                                            6
Bowmasters                                            6
slither.io                                            6
Bubble Shooter                                        6
Viber Messenger                                       5
Calorie Counter - MyFitnessPal                        5
Netflix                                               5
BeautyPlus - Easy Photo Editor & Selfie Camera        5
Yahoo Fantasy Sports - #1 Rated Fantasy App           5
Angry Birds Classic                                   5
Flow Free                                             5
TripAdvisor Hotels Flights Restaurants Attractions    5
Skyscanner                                            5
Farm Heroes Saga                                      5
MLB At Bat                                            5
Granny                                                5
eBay: Buy & Sell this Summer - Discover Deals Now!    5
theScore: Live Sports Scores, News, Stats & Videos    5
                                                     ..
Night: DU Launcher Theme                              1
Woody Puzzle                                          1
Faustop Sounds                                        1
GolfLogix GPS + Putt Breaks                           1
Mupen64+AE FREE (N64 Emulator)                        1
Chess School for Beginners                            1
Wedding Countdown Widget                              1
5 Minute Clinical Consult 2019 - #1 for 25 years      1
Moodpath - Depression & Anxiety Test                  1
OSRAM BT Control                                      1
DT future1 cam                                        1
Yuzu eReader                                          1
ck-modelcars-UK Shop                                  1
Latest Barcelona News 24h                             1
Florida Map offline                                   1
Listen and learn English in seven days                1
TN Patta Chitta EC Info                               1
Crime Wars S. Andreas                                 1
Deposit Calculator FD & RD                            1
BR                                                    1
Map and Router Badge                                  1
BD Live Call                                          1
do                                                    1
Qapital - Save Small. Live Large                      1
Official AL Fishing & Hunting                         1
Dictionary.com: Find Definitions for English Words    1
Sexy Hot Detector Prank                               1
Bible                                                 1
My Recipes Cookbook : RecetteTek                      1
BombSquad Remote                                      1
Name: App, Length: 9660, dtype: int64
FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
AUTO_AND_VEHICLES        85
LIBRARIES_AND_DEMO       85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
PARENTING                60
COMICS                   60
BEAUTY                   53
1.9                       1
Name: Category, dtype: int64
4.4     1109
4.3     1076
4.5     1038
4.2      952
4.6      823
4.1      708
4.0      568
4.7      499
3.9      386
3.8      303
5.0      274
3.7      239
4.8      234
3.6      174
3.5      163
3.4      128
3.3      102
4.9       87
3.0       83
3.1       69
3.2       64
2.9       45
2.8       42
2.6       25
2.7       25
2.5       21
2.3       20
2.4       19
1.0       16
2.2       14
1.9       13
2.0       12
2.1        8
1.7        8
1.8        8
1.6        4
1.4        3
1.5        3
1.2        1
19.0       1
Name: Rating, dtype: int64
0          596
1          272
2          214
3          175
4          137
5          108
6           97
7           90
8           74
9           65
10          64
12          60
11          52
13          49
17          48
19          41
14          41
16          35
20          35
21          35
15          31
25          30
24          30
30          30
38          29
18          27
22          26
23          25
27          25
33          24
          ... 
7300         1
4114         1
10562        1
693          1
111741       1
22773        1
1036         1
358817       1
94910        1
28633        1
28250        1
44233        1
221858       1
8119151      1
4260         1
24265        1
39661        1
2129707      1
6105         1
4450890      1
2953886      1
308944       1
9013         1
313403       1
1116393      1
14810        1
25427        1
896118       1
4300936      1
86172        1
Name: Reviews, Length: 6002, dtype: int64
Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
15M                    184
17M                    160
19M                    154
16M                    149
26M                    149
25M                    143
20M                    139
21M                    138
24M                    136
10M                    136
18M                    133
23M                    117
22M                    114
29M                    103
27M                     97
28M                     95
30M                     84
33M                     79
3.3M                    77
37M                     76
35M                     72
31M                     70
2.9M                    69
2.3M                    68
2.5M                    68
                      ... 
308k                     1
818k                     1
1,000+                   1
121k                     1
619k                     1
169k                     1
25k                      1
221k                     1
801k                     1
475k                     1
600k                     1
460k                     1
636k                     1
421k                     1
411k                     1
695k                     1
444k                     1
980k                     1
597k                     1
383k                     1
930k                     1
688k                     1
164k                     1
920k                     1
975k                     1
283k                     1
992k                     1
313k                     1
902k                     1
778k                     1
Name: Size, Length: 462, dtype: int64
1,000,000+        1579
10,000,000+       1252
100,000+          1169
10,000+           1054
1,000+             907
5,000,000+         752
100+               719
500,000+           539
50,000+            479
5,000+             477
100,000,000+       409
10+                386
500+               330
50,000,000+        289
50+                205
5+                  82
500,000,000+        72
1+                  67
1,000,000,000+      58
0+                  14
Free                 1
0                    1
Name: Installs, dtype: int64
Free    10039
Paid      800
0           1
Name: Type, dtype: int64
0          10040
$0.99        148
$2.99        129
$1.99         73
$4.99         72
$3.99         63
$1.49         46
$5.99         30
$2.49         26
$9.99         21
$6.99         13
$399.99       12
$14.99        11
$4.49          9
$7.99          7
$29.99         7
$3.49          7
$24.99         7
$5.49          6
$19.99         6
$6.49          5
$8.99          5
$11.99         5
$12.99         5
$16.99         3
$2.00          3
$10.00         3
$1.00          3
$13.99         2
$79.99         2
           ...  
$109.99        1
$37.99         1
$1.26          1
$4.85          1
$1.61          1
$394.99        1
$74.99         1
$4.60          1
$30.99         1
$28.99         1
$1.50          1
$4.29          1
$4.77          1
$2.90          1
$1.97          1
$89.99         1
$3.08          1
$1.04          1
$18.99         1
$400.00        1
$2.95          1
$1.20          1
$379.99        1
$25.99         1
$2.59          1
$19.90         1
$46.99         1
$3.88          1
$4.80          1
$299.99        1
Name: Price, Length: 93, dtype: int64
Everyone           8714
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Unrated               2
Name: Content Rating, dtype: int64
Tools                                  842
Entertainment                          623
Education                              549
Medical                                463
Business                               460
Productivity                           424
Sports                                 398
Personalization                        392
Communication                          387
Lifestyle                              381
Finance                                366
Action                                 365
Health & Fitness                       341
Photography                            335
Social                                 295
News & Magazines                       283
Shopping                               260
Travel & Local                         257
Dating                                 234
Books & Reference                      231
Arcade                                 220
Simulation                             200
Casual                                 193
Video Players & Editors                173
Puzzle                                 140
Maps & Navigation                      137
Food & Drink                           127
Role Playing                           109
Strategy                               107
Racing                                  98
                                      ... 
Art & Design;Action & Adventure          2
Card;Action & Adventure                  2
Casual;Music & Video                     2
Puzzle;Creativity                        2
Books & Reference;Education              2
Strategy;Action & Adventure              2
Books & Reference;Creativity             1
Board;Pretend Play                       1
Music & Audio;Music & Video              1
Lifestyle;Pretend Play                   1
Lifestyle;Education                      1
Racing;Pretend Play                      1
Arcade;Pretend Play                      1
Role Playing;Brain Games                 1
Health & Fitness;Action & Adventure      1
Trivia;Education                         1
Travel & Local;Action & Adventure        1
Health & Fitness;Education               1
Strategy;Creativity                      1
Communication;Creativity                 1
Strategy;Education                       1
Tools;Education                          1
Entertainment;Education                  1
Comics;Creativity                        1
Puzzle;Education                         1
Role Playing;Education                   1
February 11, 2018                        1
Card;Brain Games                         1
Parenting;Brain Games                    1
Adventure;Brain Games                    1
Name: Genres, Length: 120, dtype: int64
August 3, 2018        326
August 2, 2018        304
July 31, 2018         294
August 1, 2018        285
July 30, 2018         211
July 25, 2018         164
July 26, 2018         161
August 6, 2018        158
July 27, 2018         151
July 24, 2018         148
July 23, 2018         127
July 19, 2018         126
July 16, 2018         126
July 18, 2018         123
July 11, 2018         106
August 4, 2018        105
July 12, 2018         103
July 5, 2018           93
July 17, 2018          92
July 3, 2018           90
July 9, 2018           89
July 20, 2018          88
July 13, 2018          81
May 24, 2018           69
June 27, 2018          63
July 6, 2018           63
June 26, 2018          60
June 25, 2018          56
May 25, 2018           56
June 13, 2018          54
                     ... 
August 9, 2014          1
February 27, 2016       1
July 14, 2015           1
January 2, 2017         1
September 6, 2017       1
October 1, 2015         1
June 22, 2017           1
August 31, 2015         1
April 19, 2016          1
December 5, 2016        1
March 24, 2017          1
June 4, 2013            1
June 11, 2017           1
October 7, 2014         1
June 26, 2013           1
July 10, 2011           1
January 17, 2012        1
September 26, 2015      1
December 26, 2013       1
September 22, 2014      1
December 26, 2014       1
February 18, 2013       1
January 15, 2014        1
September 25, 2013      1
July 30, 2012           1
June 27, 2012           1
December 22, 2014       1
April 17, 2017          1
March 25, 2014          1
March 8, 2015           1
Name: Last Updated, Length: 1378, dtype: int64
Varies with device    1459
1.0                    809
1.1                    264
1.2                    178
2.0                    151
1.3                    145
1.0.0                  136
1.0.1                  119
1.4                     88
1.5                     81
1.0.2                   80
1.6                     65
1.0.3                   62
2.1                     61
3.0                     59
1.0.4                   58
1.7                     53
2.0.0                   51
1.0.5                   50
1.1.0                   49
1.1.1                   48
1.0.6                   47
1.2.1                   46
1.2.0                   43
1.8                     42
4.0                     40
1.9                     37
2.3.2                   35
1.0.7                   34
2.4                     34
                      ... 
2.47.2                   1
3.98                     1
20170828                 1
1.5.13-3598              1
7.6                      1
4.0.0.427                1
1.998                    1
5.9.1                    1
1.1.13                   1
6.0.0.0                  1
5.2.4(881)               1
acremotecontrol18        1
67                       1
Version 1.20             1
4.4.1255                 1
2.5.18                   1
1.2.0.27                 1
1.7.0m                   1
3.11.1                   1
1.1.16                   1
1.6.8                    1
3.4.06                   1
9.1.284                  1
7.1.34.28                1
1.6.0.0                  1
2018.04.02.00            1
1.14.1                   1
6.00                     1
1.48.0                   1
0.9.16                   1
Name: Current Ver, Length: 2832, dtype: int64
4.1 and up            2451
4.0.3 and up          1501
4.0 and up            1375
Varies with device    1362
4.4 and up             980
2.3 and up             652
5.0 and up             601
4.2 and up             394
2.3.3 and up           281
2.2 and up             244
4.3 and up             243
3.0 and up             241
2.1 and up             134
1.6 and up             116
6.0 and up              60
7.0 and up              42
3.2 and up              36
2.0 and up              32
5.1 and up              24
1.5 and up              20
4.4W and up             12
3.1 and up              10
2.0.1 and up             7
8.0 and up               6
7.1 and up               3
1.0 and up               2
4.0.3 - 7.1.1            2
5.0 - 8.0                2
7.0 - 7.1.1              1
5.0 - 7.1.1              1
5.0 - 6.0                1
2.2 - 7.1.1              1
4.1 - 7.1.1              1
Name: Android Ver, dtype: int64
In [5]:
# Preview the first rows of the user-review table (df_reviews).
df_reviews.head()
Out[5]:
App Translated_Review Sentiment Sentiment_Polarity Sentiment_Subjectivity
0 10 Best Foods for You I like eat delicious food. That's I'm cooking ... Positive 1.00 0.533333
1 10 Best Foods for You This help eating healthy exercise regular basis Positive 0.25 0.288462
2 10 Best Foods for You NaN NaN NaN NaN
3 10 Best Foods for You Works great especially going grocery store Positive 0.40 0.875000
4 10 Best Foods for You Best idea us Positive 1.00 0.300000

Prepare and clean Data

Define -
    Data cleaning steps- df_playstore
    1. Drop duplicates in 'App' column googleplaystore
    2. Remove "M" & "k" from Size column. Convert value to MB and dtype to float."Varies with device" would be NaN. Replace 1,000+ with 1.
    3. Remove "+" from Installs column. Convert to int.
    4. Remove single row in playstore dataframe where Category = 1.9.
    5. Replace zero 0 with Free in column Type. Convert to float
    6. Remove $ sign from Price column and convert to float
    7. Genres - split on ';' and keep only the first part (the primary genre); drop the rest.
    8. Last Updated - change dtype to datetime.
    9. Current Ver - do nothing as of now.
    10. Android Ver - split and remove the " and up" suffix, if time allows.

![image.png](attachment:image.png)
In [6]:
# Display the column labels of the app catalogue as a reference for the cleaning steps.
df_playstore.columns
Out[6]:
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')
In [7]:
# De-duplicate the catalogue by app name: when an 'App' value repeats, only
# its first occurrence is kept. The frame is modified in place.
df_playstore.drop_duplicates(subset='App', keep='first', inplace=True)
In [8]:
# Remove every row whose 'Category' is the literal string '1.9'.
# Collect the index labels of the matching rows first, then drop them in place.
bad_category_index = df_playstore.index[df_playstore['Category'] == '1.9']
df_playstore.drop(index=bad_category_index, inplace=True)
In [9]:
# Clean data for Size column: turn every raw label into a float number of megabytes.
def _size_to_megabytes(label):
    """Convert one raw 'Size' value to megabytes.

    'Varies with device' and missing values become NaN, '<n>M' becomes n,
    '<n>k' becomes n / 1000, and the stray '1,000+' label becomes 1.0 as the
    cleaning plan prescribes. Values that are already numeric pass through
    unchanged, so re-running the cell is harmless.
    """
    if pd.isnull(label):
        return np.nan
    text = str(label).strip()
    if text == 'Varies with device':
        return np.nan
    if text == '1,000+':
        return 1.0
    if text.endswith('M'):
        return float(text[:-1])
    if text.endswith('k'):
        return float(text[:-1]) / 1000
    return float(text)

# Assign the converted column back explicitly. Calling
# df_playstore['Size'].replace(..., inplace=True) acts on a temporary Series
# and does not update the frame under pandas Copy-on-Write.
df_playstore['Size'] = df_playstore['Size'].map(_size_to_megabytes).astype('float')
df_playstore['Size'].dtype
Out[9]:
dtype('float64')
In [10]:
# Clean data for Installs column: labels such as '10,000+' become the integer 10000.
# Strip the '+' marker and the thousands separators from the text form of each value.
installs_text = df_playstore['Installs'].astype(str).map(
    lambda label: label.replace('+', '').replace(',', '')
)
# Map the stray 'Free' label to '0', then convert. The result is assigned back
# explicitly: df_playstore['Installs'].replace(..., inplace=True) acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
df_playstore['Installs'] = installs_text.replace('Free', '0').astype('int')
df_playstore['Installs'].dtype
Out[10]:
dtype('int32')
In [11]:
# Make sure the Rating column is stored as floating point.
rating_as_float = df_playstore['Rating'].astype(float)
df_playstore['Rating'] = rating_as_float
In [12]:
# Store the review counts as integers, then show the resulting dtype.
reviews_as_int = df_playstore['Reviews'].astype('int')
df_playstore['Reviews'] = reviews_as_int
df_playstore['Reviews'].dtype
Out[12]:
dtype('int32')
In [13]:
# Price column: remove the '$' sign, rename the column to state its unit,
# and convert the values to float.
def _strip_dollar_sign(value):
    """Return the text of `value` without '$'; values containing no '$' are returned as-is."""
    text = str(value)
    if '$' not in text:
        return value
    return text.replace('$', '')

df_playstore['Price'] = df_playstore['Price'].map(_strip_dollar_sign)
df_playstore.rename(columns={'Price': 'Price_in_dollars'}, inplace=True)
price_as_float = df_playstore['Price_in_dollars'].astype('float')
df_playstore['Price_in_dollars'] = price_as_float
print(df_playstore['Price_in_dollars'].dtype)
float64
In [14]:
# Genres column: where a label holds several ';'-separated genres, keep only
# the part before the first ';'. Labels without ';' are left untouched.
def _primary_genre(genre_label):
    """Return the text before the first ';', or the value itself when it has no ';'."""
    text = str(genre_label)
    if ';' not in text:
        return genre_label
    return text.split(';')[0]

print (df_playstore['Genres'].shape)
df_playstore['Genres'] = df_playstore['Genres'].map(_primary_genre)
# The row count must be unchanged by the mapping; the dtype is printed for reference.
print (df_playstore['Genres'].shape[0])
print (df_playstore['Genres'].dtype)
(9659,)
9659
object
In [15]:
# Change dtype of Last Updated to datetime.
# The labels are written month-first with the month spelled out (e.g.
# "January 7, 2018"), so dayfirst=True was the wrong hint for this data.
# An explicit format states the layout and makes any label that does not
# follow it raise instead of being guessed.
df_playstore['Last Updated'] = pd.to_datetime(df_playstore['Last Updated'], format='%B %d, %Y')

Data preparing - removing NaNs and missing values from the dataset.

    1. Find and replace missing values columnwise
    2. Find and replace missing values rowwise.
In [16]:
# Column-wise missing values: count the nulls per column, keep only columns
# that have any, express them as a percentage of the row count, and plot
# them in ascending order.
null_counts = df_playstore.isnull().sum()
total_rows = df_playstore.shape[0]
missing_data = ((null_counts[null_counts > 0] / total_rows) * 100).sort_values()
missing_data.plot.bar(title='Column wise percentage missing', figsize=(6, 3))
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x17c5b28dc50>
In [17]:
# Fill NaN values with the mean of the respective column; the notebook's
# rationale is that mean imputation introduces the least bias here.
# The filled Series is assigned back explicitly:
# df_playstore[col].fillna(..., inplace=True) acts on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
for numeric_column in ('Rating', 'Size'):
    column_mean = df_playstore[numeric_column].mean()
    df_playstore[numeric_column] = df_playstore[numeric_column].fillna(column_mean)
In [18]:
# Repeat the column-wise missing-value chart after the mean imputation to see
# which columns still contain nulls.
remaining_nulls = df_playstore.isnull().sum()
remaining_nulls = remaining_nulls[remaining_nulls > 0]
missing_data = ((remaining_nulls / df_playstore.shape[0]) * 100).sort_values()
missing_data.plot.bar(title='Column wise percentage missing', figsize=(6, 3))
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x17c5b30fc18>
In the above chart, three columns - Type, Android Ver and Current Ver - still have NaN or missing values. They will not be filled with any value because I do not use them for modelling or analysis; I shall drop these columns from my analysis dataset.

Type column would be one hot encoded later in the stage.
In [19]:
# Preview the app catalogue after the cleaning and imputation steps.
df_playstore.head()
Out[19]:
App Category Rating Reviews Size Installs Type Price_in_dollars Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19.0 10000 Free 0.0 Everyone Art & Design 2018-01-07 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14.0 500000 Free 0.0 Everyone Art & Design 2018-01-15 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7 5000000 Free 0.0 Everyone Art & Design 2018-08-01 1.2.4 4.0.3 and up
3 Sketch - Draw & Paint ART_AND_DESIGN 4.5 215644 25.0 50000000 Free 0.0 Teen Art & Design 2018-06-08 Varies with device 4.2 and up
4 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3 967 2.8 100000 Free 0.0 Everyone Art & Design 2018-06-20 1.1 4.4 and up
In [20]:
# Find the rows that still contain missing values and plot, for each such
# row, the percentage of its columns that are missing. No rows are dropped here.
missing_data = df_playstore.isnull().sum(axis=1)
# A per-row percentage is relative to the number of columns (shape[1]);
# dividing by the number of rows produced a meaningless ratio.
missing_data = (missing_data[missing_data > 0] / df_playstore.shape[1]) * 100
missing_data = missing_data.sort_values()
# The chart is row-wise, so the title says so.
missing_data.plot.bar(title = 'Row wise percentage missing', figsize=(6,3))
# Inspect one sample row by position.
print (df_playstore.iloc[7333])
App                    Kingdom in Chaos
Category                         FAMILY
Rating                              4.3
Reviews                            5623
Size                                 28
Installs                         100000
Type                               Free
Price_in_dollars                      0
Content Rating             Everyone 10+
Genres                     Role Playing
Last Updated        2015-12-21 00:00:00
Current Ver                       1.0.5
Android Ver                  2.3 and up
Name: 8419, dtype: object
In the above step, I have not deleted the rows with missing values, as those rows still carry important information in many columns. After the mean imputation above, the numeric columns no longer have missing values; I will handle any remaining NaN values during scaling and modelling if they cause errors.

Data Understanding-

Once the data is cleaned and usable, plot scatter and bar charts to visualize it. Data understanding is combined with the data cleaning and preparation activities before the data is scaled and modelled.
In [21]:
#Scatterplot Matrix from seaborn
# Build ONE frame whose rows stay aligned per app. Filtering each column
# separately and zipping the results pairs values by position, so a plotted
# point could combine the rating of one app with the installs of another.
pair_columns = ['Rating', 'Size', 'Installs', 'Reviews', 'Type', 'Price_in_dollars']
# Zero installs / zero reviews are excluded because their logarithm is undefined.
has_installs_and_reviews = (df_playstore['Installs'] != 0) & (df_playstore['Reviews'] != 0)
pair_frame = df_playstore.loc[has_installs_and_reviews, pair_columns].dropna().copy()
# Same transforms as before: natural log for Installs, base-10 log for Reviews.
pair_frame['Installs'] = np.log(pair_frame['Installs'])
pair_frame['Reviews'] = np.log10(pair_frame['Reviews'])

# Colour the points by app type (the 'Type' column).
p = sns.pairplot(pair_frame, hue='Type', palette="Set2")

Data Understanding

Plot categorical column 'Category' to visualize number of Apps available per category in the dataset. This chart would find its place in the blog.
In [22]:
# Count the apps per category once; print how many distinct categories there
# are, then draw the counts as a bar chart.
category_counts = df_playstore['Category'].value_counts()
print (len(category_counts))
category_counts.plot.bar(title='Number of Categorical Apps', figsize=(30, 20), fontsize=18)
33
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x17c5c031780>

Data Preparing -

Data Preprocessing- One hot encode and Label encode the categorical columns. Then scale down all numerical columns using standard scaler function.
Using get_dummies on the 'Type' column adds two columns of 0s and 1s, which would not add a lot of bias. After scaling, the values would still easily be between 0 and 1. This is a simple categorical column that does not have many labels, so get_dummies is the most suitable choice here.

Using label encoding on the Category column marks each of the 33 different categories with a unique id. This adds some bias, but scaling should normalize the values. It would be interesting to see how the Linear Regression model weighs the Category column.

Columns like Genres and versions are not useful for modelling hence I would drop them before scaling later.

Reference links- https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
In [23]:
# Distinct category labels, in order of first appearance.
Categories = pd.unique(df_playstore['Category'])
Categories
Out[23]:
array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION'],
      dtype=object)
In [24]:
# Preview the cleaned app catalogue before the categorical columns are encoded.
df_playstore.head()
Out[24]:
App Category Rating Reviews Size Installs Type Price_in_dollars Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19.0 10000 Free 0.0 Everyone Art & Design 2018-01-07 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14.0 500000 Free 0.0 Everyone Art & Design 2018-01-15 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7 5000000 Free 0.0 Everyone Art & Design 2018-08-01 1.2.4 4.0.3 and up
3 Sketch - Draw & Paint ART_AND_DESIGN 4.5 215644 25.0 50000000 Free 0.0 Teen Art & Design 2018-06-08 Varies with device 4.2 and up
4 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3 967 2.8 100000 Free 0.0 Everyone Art & Design 2018-06-20 1.1 4.4 and up
In [25]:
# One-hot encode the 'Type' column into 'Type_<label>' indicator columns.
# The result is a new frame; df_playstore itself is left unchanged.
df_playstore_temp = pd.get_dummies(data=df_playstore, columns=['Type'], prefix='Type')
In [26]:
# Distinct category labels of the encoded working frame, held as a NumPy array.
distinct_categories = df_playstore_temp['Category'].unique()
values = np.array(distinct_categories)
values
Out[26]:
array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION'],
      dtype=object)
In [27]:
# #label encoding Categorical values
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(values)
# print (integer_encoded)
In [28]:
# #One hot encoding categorical values
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded),1)
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print (onehot_encoded)

# ##Inverse transform one hot encoded
# # inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0,:])])
# # print(inverted)
In [29]:
# Label encode the 'Category' and 'Content Rating' columns in the dataframe.
# Each column gets its own encoder: refitting a single shared LabelEncoder
# would overwrite the learned 'Category' classes, so the integer codes could
# no longer be mapped back to category names with inverse_transform.
labelencoder = LabelEncoder()
content_rating_encoder = LabelEncoder()
df_playstore_temp['Category'] = labelencoder.fit_transform(df_playstore_temp['Category'])
df_playstore_temp['Content Rating'] = content_rating_encoder.fit_transform(df_playstore_temp['Content Rating'])
# Drop the columns which are not required for scaling and modelling.
# `columns=` already identifies the axis, so no `axis` argument is passed.
df_playstore_temp = df_playstore_temp.drop(columns=['App', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'])
df_playstore_temp.head()
Out[29]:
Category Rating Reviews Size Installs Price_in_dollars Content Rating Type_Free Type_Paid
0 0 4.1 159 19.0 10000 0.0 1 1 0
1 0 3.9 967 14.0 500000 0.0 1 1 0
2 0 4.7 87510 8.7 5000000 0.0 1 1 0
3 0 4.5 215644 25.0 50000000 0.0 4 1 0
4 0 4.3 967 2.8 100000 0.0 1 1 0
Final check and updates on missing values before applying scaling. If there are NaN values then scaling would fail. Ideally there shouldn't be any NaN or missing values left after the data cleaning steps above. If any remain, removing the rows with missing values could help.
In [30]:
# Make all values numeric in the dataframe before scaling. DataFrame.apply
# returns a new frame, so the result has to be assigned back to take effect.
df_playstore_temp = df_playstore_temp.apply(pd.to_numeric)
# Fill in column-level mean values to replace NaN uniformly in the dataframe.
df_playstore_temp = df_playstore_temp.fillna(df_playstore_temp.mean())
# Find any NaN values still present after replacing NaN with the column mean;
# two empty index arrays mean the frame is clean.
np.where(df_playstore_temp.isna().values)
Out[30]:
(array([], dtype=int64), array([], dtype=int64))

Scaling the numerical data

In [31]:
# Standardize every column of the numerical dataframe with StandardScaler,
# then wrap the result in a frame that keeps the original index and columns.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_playstore_temp.values)
df_scaled_features = pd.DataFrame(
    data=scaled_features,
    columns=df_playstore_temp.columns,
    index=df_playstore_temp.index,
)
df_scaled_features.describe()
Out[31]:
Category Rating Reviews Size Installs Price_in_dollars Content Rating Type_Free Type_Paid
count 9.659000e+03 9.659000e+03 9.659000e+03 9.659000e+03 9.659000e+03 9.659000e+03 9.659000e+03 9.659000e+03 9.659000e+03
mean 1.079973e-15 5.057210e-16 1.034994e-16 -3.126116e-16 1.206516e-15 1.280889e-15 2.762345e-15 2.989430e-15 2.049700e-15
std 1.000052e+00 1.000052e+00 1.000052e+00 1.000052e+00 1.000052e+00 1.000052e+00 1.000052e+00 1.000052e+00 1.000052e+00
min -1.995182e+00 -6.419841e+00 -1.182774e-01 -9.997019e-01 -1.446830e-01 -6.523535e-02 -1.456013e+00 -3.429224e+00 -2.914021e-01
25% -6.857044e-01 -3.504909e-01 -1.182638e-01 -7.402240e-01 -1.446644e-01 -6.523535e-02 -4.427831e-01 2.916112e-01 -2.914021e-01
50% -3.285741e-01 5.413244e-02 -1.177494e-01 -2.155307e-01 -1.428227e-01 -6.523535e-02 -4.427831e-01 2.916112e-01 -2.914021e-01
75% 9.809034e-01 6.610674e-01 -1.022221e-01 2.258001e-01 -1.260803e-01 -6.523535e-02 -4.427831e-01 2.916112e-01 -2.914021e-01
max 1.814207e+00 1.672626e+00 4.256261e+01 3.903556e+00 1.845807e+01 2.367183e+01 3.610135e+00 2.916112e-01 3.431684e+00

PCA feature extraction

In [32]:
# Helper to visualise how much variance each principal component explains
def scree_plot(pca):
    '''
    Creates a scree plot associated with the principal components.

    Draws one bar per component (its explained variance ratio), overlays the
    cumulative ratio as a line, annotates each bar with its percentage, and
    prints the component indices together with the cumulative ratios.

    INPUT: pca - a fitted scikit-learn PCA instance; only its
                 explained_variance_ratio_ attribute is read

    OUTPUT:
            None
    '''
    vals = pca.explained_variance_ratio_
    num_components = len(vals)
    ind = np.arange(num_components)
    cumvals = np.cumsum(vals)

    plt.figure(figsize=(25, 10))
    ax = plt.subplot(111)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    print (ind, cumvals)
    for i in range(num_components):
        # Format numerically: slicing str(float) truncates instead of rounding
        # and produces text such as '1e-0' for values in scientific notation.
        ax.annotate("%.2f%%" % (vals[i] * 100), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)

    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)

    ax.set_xlabel("Principal Component")
    # The axis shows ratios in [0, 1]; percentages appear only in the annotations.
    ax.set_ylabel("Variance Explained (ratio)")
    plt.title('Explained Variance Per Principal Component')
    
In [33]:
# Fit a 9-component PCA on the scaled features, keep the projected data,
# and draw the scree plot for the fitted model.
n_pca_components = 9
pca = PCA(n_components=n_pca_components)
pca_scaled_features = pca.fit_transform(scaled_features)
scree_plot(pca)
[0 1 2 3 4 5 6 7 8] [0.23470125 0.41830264 0.55699095 0.66729105 0.77006462 0.86746425
 0.95836796 0.99992036 1.        ]
In [62]:
# Pair each feature name with its weight in one principal component
# and return the pairs ordered from the lowest weight to the highest.
def sorted_weights(pca, ix, dataset):
    """
    Map the weights of a single PCA component onto the feature names.

    Input parameters-
    pca = fitted PCA model exposing ``components_``
    ix = row index into ``pca.components_`` selecting the component
    dataset = a dataframe of scaled features; its column labels name the weights

    Returns a list of (weight, feature name) tuples sorted by ascending weight.
    """
    component_weights = pca.components_[ix]
    feature_names = dataset.keys().values
    return sorted(zip(component_weights, feature_names), key=lambda pair: pair[0])
In [63]:
# NOTE(review): ix is used directly as a row index into pca.components_, and
# Python indexing starts at 0, so ix=1 selects the second row (the first row
# would be ix=0). Confirm this is the intended component.
sorted_weights(pca,1,df_scaled_features)
Out[63]:
[(-0.12229910905176003, 'Type_Free'),
 (-0.006965205700940786, 'Category'),
 (0.052776060545864194, 'Price_in_dollars'),
 (0.12230436804321762, 'Type_Paid'),
 (0.13392012871685735, 'Rating'),
 (0.1488315583383376, 'Content Rating'),
 (0.1913971265965797, 'Size'),
 (0.6631655008104139, 'Installs'),
 (0.6713746849993583, 'Reviews')]
In [36]:
# NOTE(review): ix=2 is a 0-based row index into pca.components_, i.e. its third row.
sorted_weights(pca,2,df_scaled_features)
Out[36]:
[(-0.5694149133141971, 'Size'),
 (-0.541988498849535, 'Content Rating'),
 (-0.21127969916573552, 'Rating'),
 (-0.028721963191943947, 'Type_Paid'),
 (0.006696511389931445, 'Price_in_dollars'),
 (0.028953514851547082, 'Type_Free'),
 (0.15393357631104168, 'Reviews'),
 (0.1884289484379903, 'Installs'),
 (0.5258011956515887, 'Category')]
In [37]:
# NOTE(review): ix=3 is a 0-based row index into pca.components_, i.e. its fourth row.
sorted_weights(pca,3,df_scaled_features)
Out[37]:
[(-0.8812904230194594, 'Rating'),
 (-0.09345087354370417, 'Category'),
 (-0.03562384899857902, 'Type_Paid'),
 (0.03558474197893066, 'Type_Free'),
 (0.03804603155380957, 'Reviews'),
 (0.052252220680459364, 'Installs'),
 (0.058975573719736885, 'Size'),
 (0.2285083375432296, 'Content Rating'),
 (0.3901112896899358, 'Price_in_dollars')]

PCA Analysis results-

NOTE -

PCA analysis shows that 8 components have variance between 23.4% to 4.15% which would affect the prediction of outcomes.

As a rule of thumb, a component weight (positive or negative) with an absolute value above 0.5 is deemed to affect the prediction. Hence, in the first set of components, Reviews and Installs play a big role in predicting which category of apps would receive reviews and a higher number of installations.

In the second set of components, Category and Installs have positive weights while Size and Content Rating have negative weights, i.e. they vary inversely. It implies that the Size of an App and its number of installs are dependent, and that customers prefer to install small apps.


The third set of components shows that App Price and Ratings have a strong inverse relationship: customers who provide higher Ratings tend to install apps which are cheaper.

KMeans clustering Analysis

KMeans clustering of the dataset shows that the data falls mostly into three clusters. I have used KMeans clustering to separate clusters of data within the dataset based on the PCA analysis, but it seems that the data is not clearly separable and hangs around 3 major clusters.
In [38]:
def plot_data(data, labels):
    '''
    Plot data as a 3D scatter with colors associated with labels.

    INPUT: data - 2D array; columns 0, 1 and 2 are used as the x, y and z
                  coordinates
           labels - per-point values mapped to colors with the 'tab10' colormap

    OUTPUT:
            None
    '''
    fig = plt.figure()
    # add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is not
    # added automatically in recent matplotlib releases and renders blank.
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='tab10')
In [39]:
# Evaluate unsupervised learning on the PCA-transformed data with 15 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels and the plot, reproducible across runs.
kmeans_pop = KMeans(n_clusters=15, random_state=0)
model_pop = kmeans_pop.fit(pca_scaled_features)
labels_pop = model_pop.predict(pca_scaled_features)
plot_data(pca_scaled_features,labels_pop)
In [40]:
# Evaluate unsupervised learning on the PCA-transformed data with 7 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels and the plot, reproducible across runs.
kmeans_pop = KMeans(n_clusters=7, random_state=0)
model_pop = kmeans_pop.fit(pca_scaled_features)
labels_pop = model_pop.predict(pca_scaled_features)
plot_data(pca_scaled_features,labels_pop)
In [41]:
# Preview the first rows of the encoded (unscaled) dataframe.
df_playstore_temp.head()
Out[41]:
Category Rating Reviews Size Installs Price_in_dollars Content Rating Type_Free Type_Paid
0 0 4.1 159 19.0 10000 0.0 1 1 0
1 0 3.9 967 14.0 500000 0.0 1 1 0
2 0 4.7 87510 8.7 5000000 0.0 1 1 0
3 0 4.5 215644 25.0 50000000 0.0 4 1 0
4 0 4.3 967 2.8 100000 0.0 1 1 0

Modelling

Use Linear Regression model to predict the following business questions-
1. Predict Ratings of apps for all categories.
2. Predict Pricing of apps.
3. Predict pricing in relation to number of installations
Case One- Target Label = 'Rating'
In [42]:
#Split the dataset into features and target labels. Rating column is the target label.
Target_label = df_playstore_temp['Rating']
features_label = df_playstore_temp.drop('Rating',axis=1)

# Split the features into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_label,Target_label,test_size = 0.2,random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

#Train using LinearRegression model
lm_model = LinearRegression(normalize=True)

# Fit directly instead of inside a bare try/except: swallowing the exception
# hid the real cause and the predict() call below would fail on the unfitted
# model anyway.
lm_model.fit(X_train, y_train)

y_test_preds = lm_model.predict(X_test)  # Predictions on the held-out test set
r2_test = r2_score(y_test, y_test_preds)  # R squared of those predictions

# Print r2 to see result
print('R Squared value of the predicted labels: ' + str(r2_test))
# Pair each fitted coefficient with the name of the feature column it belongs to
coefficients = list(zip(lm_model.coef_, X_test.columns))
print ('Mean Squared Error: '+ str(metrics.mean_squared_error(y_test,y_test_preds)))
print ('Mean absolute Error: '+ str(metrics.mean_absolute_error(y_test,y_test_preds)))
print ('Mean squared Log Error: '+ str(metrics.mean_squared_log_error(y_test,y_test_preds)))
Training set has 7727 samples.
Testing set has 1932 samples.
R Squared value of the predicted labels: 0.012492315518284114
Mean Squared Error: 0.24072991323851956
Mean absolute Error: 0.32423723909300156
Mean squared Log Error: 0.011939118948726549
In [43]:
# Linear regression coefficients for Ratings as target labels,
# shown as (coefficient, feature name) pairs.
coefficients
Out[43]:
[(-0.0018024997071665536, 'Category'),
 (1.3819674820377472e-08, 'Reviews'),
 (0.0010433895070699137, 'Size'),
 (1.0710294006119369e-10, 'Installs'),
 (-0.0007893641940442483, 'Price_in_dollars'),
 (0.004981703040734751, 'Content Rating'),
 (0.0015468752266353989, 'Type_Free'),
 (0.08451990649932442, 'Type_Paid')]
In [44]:
# Plot predicted vs. actual ratings with a fitted regression line
plt.figure(figsize=(12,7))
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments; the label gives plt.legend() an entry to display instead of
# warning that no labelled handles were found.
sns.regplot(x=y_test_preds, y=y_test, color='teal', marker='x', label='Test set apps')
plt.legend()
plt.title('Linear Regression model- App Ratings')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()
No handles with labels found to put in legend.
Case Two- Target Label = 'Price_in_dollars'
In [45]:
#Split the scaled dataset into features and target labels. Price_in_dollars column is the target label.
Target_label1 = df_scaled_features['Price_in_dollars']
features_label1 = df_scaled_features.drop('Price_in_dollars',axis=1)

# Split the features into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(features_label1,Target_label1,test_size = 0.2,random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train1.shape[0]))
print("Testing set has {} samples.".format(X_test1.shape[0]))

#Train using LinearRegression model
lm_model1 = LinearRegression(normalize=True)

# Fit directly instead of inside a bare try/except: swallowing the exception
# hid the real cause and the predict() call below would fail on the unfitted
# model anyway.
lm_model1.fit(X_train1, y_train1)

y_test_preds1 = lm_model1.predict(X_test1)  # Predictions on the held-out test set
r2_test1 = r2_score(y_test1, y_test_preds1)  # R squared of those predictions

# Pair each fitted coefficient with the name of the feature column it belongs to
coefficients1 = list(zip(lm_model1.coef_, X_test1.columns))
# Print r2 to see result
print('R Squared value of the predicted labels: ' + str(r2_test1))
print ('Mean Squared Error: '+ str(metrics.mean_squared_error(y_test1,y_test_preds1)))
print ('Mean absolute Error: '+ str(metrics.mean_absolute_error(y_test1,y_test_preds1)))
# Mean squared log error is not reported here: the standardized target contains
# negative values, for which that metric is undefined.
Training set has 7727 samples.
Testing set has 1932 samples.
R Squared value of the predicted labels: 0.051591060661092536
Mean Squared Error: 1.9064629909990272
Mean absolute Error: 0.13830683321673282
In [46]:
# Linear regression coefficients for Price as target labels,
# shown as (coefficient, feature name) pairs.
coefficients1
Out[46]:
[(-0.011946421241898689, 'Category'),
 (-0.019292765896548807, 'Rating'),
 (0.001969853668919766, 'Reviews'),
 (-0.012532837632746823, 'Size'),
 (0.0006992635413939693, 'Installs'),
 (0.0006528848275836025, 'Content Rating'),
 (0.0021467979800262327, 'Type_Free'),
 (0.19030847314185675, 'Type_Paid')]
In [47]:
# Plot actual vs. predicted (standardized) price with a fitted regression line
plt.figure(figsize=(12,7))
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments; the label gives plt.legend() an entry to display instead of
# warning that no labelled handles were found.
sns.regplot(x=y_test1, y=y_test_preds1, color='teal', marker='x', label='Test set apps')
plt.legend()
plt.title('Linear Regression model- predicted price vs actual price')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.show()
No handles with labels found to put in legend.
Case Three- Target Label = 'Installs'
In [48]:
#Split the scaled dataset into features and target labels. 'Installs' column is the target label.
Target_label2 = df_scaled_features['Installs']
features_label2 = df_scaled_features.drop('Installs',axis=1)

# Split the features into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(features_label2,Target_label2,test_size = 0.2,random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train2.shape[0]))
print("Testing set has {} samples.".format(X_test2.shape[0]))

#Train using LinearRegression model
lm_model2 = LinearRegression(normalize=True)

# Fit directly instead of inside a bare try/except: swallowing the exception
# hid the real cause and the predict() call below would fail on the unfitted
# model anyway.
lm_model2.fit(X_train2, y_train2)

# Predictions on the held-out test set
y_test_preds2 = lm_model2.predict(X_test2)
# R squared of those predictions
r2_test2 = r2_score(y_test2, y_test_preds2)

# Pair each fitted coefficient with the name of the feature column it belongs to
coefficients2 = list(zip(lm_model2.coef_, X_test2.columns))
# Print r2 to see result
print('R Squared value of the predicted labels: ' + str(r2_test2))
print ('Mean Squared Error: '+ str(metrics.mean_squared_error(y_test2,y_test_preds2)))
print ('Mean absolute Error: '+ str(metrics.mean_absolute_error(y_test2,y_test_preds2)))
# Mean squared log error is not reported here: the standardized target contains
# negative values, for which that metric is undefined.
Training set has 7727 samples.
Testing set has 1932 samples.
R Squared value of the predicted labels: 0.43554084271547755
Mean Squared Error: 0.4897947759940874
Mean absolute Error: 0.1285100215661726
In [49]:
# Linear regression coefficients for Installs as target labels,
# shown as (coefficient, feature name) pairs.
coefficients2
Out[49]:
[(0.025157112715526838, 'Category'),
 (0.0075126277466649296, 'Rating'),
 (0.6627852226270038, 'Reviews'),
 (-0.0013543417036387094, 'Size'),
 (0.0006291066872166958, 'Price_in_dollars'),
 (0.013286188837415875, 'Content Rating'),
 (0.01776075161244003, 'Type_Free'),
 (-0.003395519959783328, 'Type_Paid')]
In [50]:
# Overlay the predicted-vs-actual regression plots of the Price and Installs models
plt.figure(figsize=(12,7))
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments; the labels give plt.legend() entries to display.
sns.regplot(x=y_test_preds1, y=y_test1, color='teal', marker='x', label='Price_in_dollars')
sns.regplot(x=y_test_preds2, y=y_test2, color='orange', label='Installs')
plt.legend()
plt.title('Linear Regression model- Predicted Price vs Installs')
# Both series plot model predictions on x and the actual test values on y,
# taken from the standardized dataframe, so the axes are labelled accordingly.
plt.xlabel('Predicted value (standardized)')
plt.ylabel('Actual value (standardized)')
plt.show()
No handles with labels found to put in legend.

Evaluation and Analysis of results

Evaluate the mean squared error MSE- 
    Case One- Target Label = 'Rating' (1.3819674820377472e-08, 'Reviews'), (1.0710294006119369e-10,'Installs')
    with 'Rating' as target label, model has predicted with much higher coefficients of Reviews and Installs clearly because of high biases which does not show a correct picture.Category values should have been weighed higher -  (-0.0018024997071665536, 'Category'). To me this looks like a misprediction due to biased values.


Case Two- Target Label = 'Price_in_dollars' - Mean Squared Error: 1.9064629909990272. The MSE is not close to 0 and is the highest in comparison to the other target labels, but the coefficients are evenly distributed and it seems that there is no bias introduced during prediction. With the maximum coefficient value (0.19030847314185675, 'Type_Paid'), Linear Regression defines Price as an important feature for App selection, which sounds rational too.


Case Three- Target Label = 'Installs' - Mean Squared Error: 0.4897947759940874, MSE is close to Zero which is a good sign and coefficients have correctly identified the weights of features as (0.6627852226270038, 'Reviews')(0.025157112715526838, 'Category'). This shows that Linear Regression model could predict success of an App better if the target label is set to Installs.

It is also interesting to see the correlation between predicted price and installs.

Predicted Price vs Predicted Installs

The Linear Regression model shows that predicted price and number of installs are proportional and tend to increase together. (The small teal line is predicted price and the large orange line is predicted Installs.) From the violin plot at the end it will be clear that for certain categories of Apps on the Play Store, consumers pay for the App.



"An MSE of zero, meaning that the estimator predicts observations of the parameter with perfect accuracy, is the ideal, but is typically not possible."

Reference link- https://en.wikipedia.org/wiki/Mean_squared_error

Data Analysis - App pricing distribution trend across categories

Apps in categories like Events, Lifestyle, Games, Books and Libraries range from free to the highest prices, whereas Books and References, Dating, Finance, Health and Fitness, Gaming, Family and Medical apps have lower prices.

Apps in categories like Game, Lifestyle, Family, Medical, Travel and local, productivity, Health and fitness get paid for with higher number of installations.
In [51]:
# Strip plot of app prices, one row of points per category
sns.set_style('darkgrid')
all_categories = df_playstore['Category'].unique()
subset_df = df_playstore[df_playstore.Category.isin(all_categories)]
fig, ax = plt.subplots(figsize=(15, 8))
p = sns.stripplot(data=subset_df, x="Price_in_dollars", y="Category", jitter=True, linewidth=1, ax=ax)
title = ax.set_title('App pricing trend across all categories')
In [52]:
# Strip plot of install counts, one row of points per category
sns.set_style('darkgrid')
all_categories = df_playstore['Category'].unique()
subset_df = df_playstore[df_playstore.Category.isin(all_categories)]
fig, ax = plt.subplots(figsize=(15, 8))
p = sns.stripplot(data=subset_df, x="Installs", y="Category", jitter=True, linewidth=1, ax=ax)
title = ax.set_title('App Installation trend across all categories')

Correlation between features of App store data

Reviews and Installs have a strong correlation, which is rational in the given dataset.

In [53]:
# Correlation heatmap for the numerically encoded version of the playstore data.
# The trailing semicolon suppresses the Axes repr so only the figure is shown.
sns.heatmap(df_playstore_temp.corr(), annot=True, fmt='.2f');
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x17c61473e80>

Histograms for the numerically encoded (unscaled) Playstore dataset.

In [54]:
# One histogram per column of the encoded dataframe. The trailing semicolon
# suppresses the array-of-Axes repr so only the figure is shown.
df_playstore_temp.hist(figsize=(15,10));
Out[54]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62BAC908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62C6BE80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62C97898>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62CC72B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62CEBD30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62CEBD68>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62D4C2B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62D70D30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000017C62DA07F0>]],
      dtype=object)

Free Or Paid Apps vs Rating

Paid apps have received better ratings and a higher number of Installs.

In [55]:
# Reusable seaborn jointplot helper for comparing two features of the dataset
def func_plotjoint(a,b,color='g'):
    """Draw a seaborn jointplot of two columns.
    Input parameters -
    a = data for the x-axis (dataframe column)
    b = data for the y-axis (dataframe column)
    color = 'r' for red, 'c' for cyan, default is green
    """
    joint_kwargs = dict(x=a, y=b, color=color)
    sns.jointplot(**joint_kwargs)
In [56]:
# Joint plots for the feature pairs highlighted by the modelling and PCA analysis:
# (x column, y column, plot color)
joint_plot_specs = [
    ('Installs', 'Rating', 'g'),
    ('Price_in_dollars', 'Rating', 'r'),
    ('Installs', 'Price_in_dollars', 'c'),
]
for x_column, y_column, plot_color in joint_plot_specs:
    func_plotjoint(df_playstore[x_column], df_playstore[y_column], color=plot_color)
In [57]:
from plotly.offline import init_notebook_mode, iplot

# Keep only the apps whose category has at least `min_apps` entries
min_apps = 50
groups = df_playstore.groupby('Category').filter(lambda x: len(x) >= min_apps).reset_index()
average_rating = np.nanmean(groups['Rating'])
print('Average rating = ', average_rating)

# Sorted so the violins appear in the same order on every run
# (iteration order of a set of strings is not stable between sessions).
categories = sorted(set(groups.Category))

layout = {'title' : 'App rating distribution and probability density for all categories',
        'xaxis': {'tickangle':-40},
        'yaxis': {'title': 'Rating'},
          'plot_bgcolor': 'rgb(250,250,250)',
          # Horizontal reference line at the average rating, spanning all violins
          'shapes': [{
              'type' :'line',
              'x0': -.5,
              'y0': average_rating,
              'x1': len(categories) - 0.5,
              'y1': average_rating,
              'line': { 'dash': 'dashdot'}
          }]
          }

# One violin trace per category, built from that category's ratings
data = [{
    'y': df_playstore.loc[df_playstore.Category == category, 'Rating'],
    'type':'violin',
    'name' : category,
    'showlegend':False,
    } for category in categories]

init_notebook_mode()
iplot({'data': data, 'layout': layout})
Average rating =  4.173243045387995

Average rating for all App Categories

This chart shows median value of Ratings per Category together with Max Rating received and minimum Rating. This chart also shows combination of ratings distribution and probability density per category.

Tools, Productivity, Finance, Category- has range of ratings from 1 to 5

Assuming that Rating is equivalent to popularity, and width and length of curved area of the graph implies that Comics, Health & Fitness,Parenting, Art and Design,

Reference- https://en.wikipedia.org/wiki/Violin_plot https://en.wikipedia.org/wiki/Kernel_density_estimation http://seaborn.pydata.org/generated/seaborn.violinplot.html

In [58]:
from plotly.offline import init_notebook_mode, iplot

# Keep only the apps whose category has at least `min_apps` entries
min_apps = 300
groups = df_playstore.groupby('Category').filter(lambda x: len(x) >= min_apps).reset_index()
average_price = np.nanmean(groups['Price_in_dollars'])
# The value printed is the mean price, so it is labelled as such
print('Average price = ', average_price)

# Sorted so the violins appear in the same order on every run
# (iteration order of a set of strings is not stable between sessions).
categories = sorted(set(groups.Category))

# NOTE(review): the title says "top 10 categories" but the filter keeps every
# category with at least `min_apps` apps - confirm the resulting count.
layout = {'title' : 'App Price distribution and probability density for top 10 categories',
        'xaxis': {'tickangle':-40},
        'yaxis': {'title': 'Price_in_dollars'},
          'plot_bgcolor': 'rgb(250,250,250)',
          # Horizontal reference line at the average price, spanning all violins
          'shapes': [{
              'type' :'line',
              'x0': -.5,
              'y0': average_price,
              'x1': len(categories) - 0.5,
              'y1': average_price,
              'line': { 'dash': 'dashdot'}
          }]
          }

# One violin trace per category, built from that category's prices
data = [{
    'y': df_playstore.loc[df_playstore.Category == category, 'Price_in_dollars'],
    'type':'violin',
    'name' : category,
    'showlegend':False,
    } for category in categories]

init_notebook_mode()
iplot({'data': data, 'layout': layout})
Average rating =  1.5249594615266941

User Review Data

Define-
    1. What do users think about the Apps? Keywords?
    2. Sentiment Analysis.
In [59]:
# Preview the first rows of the user-review dataframe.
df_reviews.head()
Out[59]:
App Translated_Review Sentiment Sentiment_Polarity Sentiment_Subjectivity
0 10 Best Foods for You I like eat delicious food. That's I'm cooking ... Positive 1.00 0.533333
1 10 Best Foods for You This help eating healthy exercise regular basis Positive 0.25 0.288462
2 10 Best Foods for You NaN NaN NaN NaN
3 10 Best Foods for You Works great especially going grocery store Positive 0.40 0.875000
4 10 Best Foods for You Best idea us Positive 1.00 0.300000
A wordcloud of the words most frequently used in user reviews.
In [60]:
# Treat missing reviews as empty text (assigned back rather than replaced in place).
df_reviews['Translated_Review'] = df_reviews['Translated_Review'].fillna('')
# Join every review into one text corpus. Passing the Series (or one of its
# methods) to str() would only yield a truncated repr, not the full text.
review = ' '.join(df_reviews['Translated_Review'].astype(str))

# Create and generate a word cloud image from the review text:
wordcloud = WordCloud(max_words=100, background_color='white').generate(review)

# Create the figure before drawing so the image lands on it; creating it after
# imshow leaves an extra, empty figure in the output.
plt.figure(figsize=(40,30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
<Figure size 2880x2160 with 0 Axes>
A wordcloud of the words most frequently used in App names.
In [61]:
# Treat missing app names as empty text (assigned back rather than replaced in place).
df_reviews['App'] = df_reviews['App'].fillna('')
# Join the distinct app names into one text corpus. Only the 'App' column is
# used: str() of the whole frame would give a truncated repr of every column.
Appname = ' '.join(df_reviews['App'].drop_duplicates().astype(str))
# Create and generate a word cloud image from the app names:
wordcloud = WordCloud(max_words=200, background_color='white').generate(Appname)

# Create the figure before drawing so the image lands on it; creating it after
# imshow leaves an extra, empty figure in the output.
plt.figure(figsize=(40,30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
<Figure size 2880x2160 with 0 Axes>